In [1]:
## 処理設定
verbose = False
check = False
### 言語の割合の均等化
balanced = True
### LDA 用
## トピック数
n_topics = 15 # 30は多過ぎる?
## doc, term の設定
doc_type = 'form' # 変更不可
doc_attr = 'sound' # spell か sound の選択
max_doc_size = 12
##
term_size = 'character'
term_type = 'skippy3gram'
## skippy n-gram の結合範囲
max_distance_val = round(max_doc_size * 0.8)
print(f"max_distance_val: {max_distance_val}")
## ngram を包括的にするかどうか
ngram_is_inclusive = True
### DTM 構築
## term の最低頻度
term_min_freq = 2
## 高頻度 term の濫用指標: 大きくし過ぎないように.0.05 は十分に大きい
term_abuse_threshold = 0.04
max_distance_val: 10
In [2]:
## 言語の選別
##音声解析で使える言語
## arabic, english, french, german, icelandic, spanish, swahili
##綴り解析で使える言語
## arabic, bengali, chinese, esperanto, english, finnish, french, german, greek,
## hebrew, hungarian, icelandic, irish, italian, japanese, russian, spanish, swahili
## turkish
import re
select_languages = True
target_lang_names = "arabic, bengali, chinese, english, french, german, \
greek, hungarian, icelandic, russian, japanese, turkish"
target_lang_names = "arabic, english, french, icelandic, spanish, swahili"
selected_langs = re.split(r",\s*", target_lang_names)
print(f"selected languages: {selected_langs}")
selected languages: ['arabic', 'english', 'french', 'icelandic', 'spanish', 'swahili']
In [3]:
import sys, os, random, re, glob
import pandas as pd
import pprint as pp
from functools import reduce
In [4]:
## load data to process
from pathlib import Path
import pprint as pp
wd = Path(".")
dirs = [ x for x in wd.iterdir() if x.is_dir() and not x.match(r"plot*") ]
if verbose:
print(f"The following {len(dirs)} directories are potential targets:")
pp.pprint(dirs)
## list up files in target directory
wd = Path(".")
target_dir = "data-words" # can be changed
target_files = sorted(list(wd.glob(f"{target_dir}/*.csv")))
#
print(f"\n{target_dir} contains {len(target_files)} files to process")
pp.pprint(target_files)
data-words contains 26 files to process
[PosixPath('data-words/base-sound-Arabic-r0-1k-mc.csv'),
PosixPath('data-words/base-sound-English-r6e-original.csv'),
PosixPath('data-words/base-sound-French-r0-1k-mc.csv'),
PosixPath('data-words/base-sound-French-r0-opendic-s900.csv'),
PosixPath('data-words/base-sound-German-r1a-original.csv'),
PosixPath('data-words/base-sound-Icelandic-r0-1k-mc.csv'),
PosixPath('data-words/base-sound-Spanish-r0-1k-mc.csv'),
PosixPath('data-words/base-sound-Swahili-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Arabic-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Chinese-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-English-r6e-original.csv'),
PosixPath('data-words/base-spell-Esperanto-r0-orginal.csv'),
PosixPath('data-words/base-spell-Finnish-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-French-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-German-r1a-original.csv'),
PosixPath('data-words/base-spell-Greek-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Hebrew-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Hungarian-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Icelandic-r0-original.csv'),
PosixPath('data-words/base-spell-Irish-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Italian-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Japanese-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Russian-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Spanish-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Swahili-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Turkish-r0-1k-mc.csv')]
In [5]:
import pandas as pd
## データ型の辞書
types = re.split(r",\s+", "spell, sound")
type_settings = { t : 0 for t in types }
print(type_settings)
## 言語名の辞書
lang_name_list = "arabic, bengali, chinese, english, esperanto, finnish, french, \
greek, galician, german, hungarian, icelandic, irish, italian, japanese, russian, spanish, swahili, turkish"
langs = re.split(r",\s*", lang_name_list)
lang_settings = { lang : 0 for lang in langs }
print(f"{len(lang_settings.keys())} langs are available")
print(lang_settings)
## 辞書と統合
settings = { **type_settings, **lang_settings }
print(settings)
{'spell': 0, 'sound': 0}
19 langs are available
{'arabic': 0, 'bengali': 0, 'chinese': 0, 'english': 0, 'esperanto': 0, 'finnish': 0, 'french': 0, 'greek': 0, 'galician': 0, 'german': 0, 'hungarian': 0, 'icelandic': 0, 'irish': 0, 'italian': 0, 'japanese': 0, 'russian': 0, 'spanish': 0, 'swahili': 0, 'turkish': 0}
{'spell': 0, 'sound': 0, 'arabic': 0, 'bengali': 0, 'chinese': 0, 'english': 0, 'esperanto': 0, 'finnish': 0, 'french': 0, 'greek': 0, 'galician': 0, 'german': 0, 'hungarian': 0, 'icelandic': 0, 'irish': 0, 'italian': 0, 'japanese': 0, 'russian': 0, 'spanish': 0, 'swahili': 0, 'turkish': 0}
In [6]:
check = False
setting_keys = list(settings.keys())
print(f"target setting_keys: {setting_keys}")
d_parts = [ ]
for lang in lang_settings.keys():
local_settings = settings.copy()
if check:
print(f"processing: {lang}")
try:
## ファイル単位で処理
for f in [ f for f in target_files if lang.capitalize() in str(f) ]:
print(f"reading: {f}")
# 言語名の指定
local_settings[lang] = 1
# 型名の指定
for key in type_settings.keys():
if key in str(f):
local_settings[key] = 1
else:
local_settings[key] = 0 # この変更を見落とさないように
# ファイル処理
try:
data = pd.read_csv(f, encoding = 'utf-8', sep = ",", on_bad_lines = 'skip') # Crucially, ...= skip
if check:
print(data)
#dfx = pd.DataFrame(data, columns = setting_keys)
dfx = pd.DataFrame(data, columns = ['form', 'freq'])
for key in settings.keys():
dfx[key] = local_settings[key]
if check:
print(dfx)
d_parts.append(dfx)
except FileNotFoundError:
pass
except IndexError:
pass
#
if verbose:
d_parts
target setting_keys: ['spell', 'sound', 'arabic', 'bengali', 'chinese', 'english', 'esperanto', 'finnish', 'french', 'greek', 'galician', 'german', 'hungarian', 'icelandic', 'irish', 'italian', 'japanese', 'russian', 'spanish', 'swahili', 'turkish'] reading: data-words/base-sound-Arabic-r0-1k-mc.csv reading: data-words/base-spell-Arabic-r0-1k-mc.csv reading: data-words/base-spell-Chinese-r0-1k-mc.csv reading: data-words/base-sound-English-r6e-original.csv reading: data-words/base-spell-English-r6e-original.csv reading: data-words/base-spell-Esperanto-r0-orginal.csv reading: data-words/base-spell-Finnish-r0-1k-mc.csv reading: data-words/base-sound-French-r0-1k-mc.csv reading: data-words/base-sound-French-r0-opendic-s900.csv reading: data-words/base-spell-French-r0-1k-mc.csv reading: data-words/base-spell-Greek-r0-1k-mc.csv reading: data-words/base-sound-German-r1a-original.csv reading: data-words/base-spell-German-r1a-original.csv reading: data-words/base-spell-Hungarian-r0-1k-mc.csv reading: data-words/base-sound-Icelandic-r0-1k-mc.csv reading: data-words/base-spell-Icelandic-r0-original.csv reading: data-words/base-spell-Irish-r0-1k-mc.csv reading: data-words/base-spell-Italian-r0-1k-mc.csv reading: data-words/base-spell-Japanese-r0-1k-mc.csv reading: data-words/base-spell-Russian-r0-1k-mc.csv reading: data-words/base-sound-Spanish-r0-1k-mc.csv reading: data-words/base-spell-Spanish-r0-1k-mc.csv reading: data-words/base-sound-Swahili-r0-1k-mc.csv reading: data-words/base-spell-Swahili-r0-1k-mc.csv reading: data-words/base-spell-Turkish-r0-1k-mc.csv
In [7]:
## データ統合
raw_df = pd.concat(d_parts)
raw_df
Out[7]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | german | hungarian | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | kamaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | ʔanaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | lahu | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | ʔan | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | huːa | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 995 | çoğul | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 996 | öfke | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 997 | iddia | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 998 | kıta | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
28858 rows × 23 columns
In [8]:
## 言語名= language の列を追加
check = False
language_vals = [ ]
for i, row in raw_df.iterrows():
if check:
print(row)
for j, lang in enumerate(langs):
if check:
print(f"{i}: {lang}")
if row[lang] == 1:
language_vals.append(lang)
if verbose:
print(language_vals)
len(language_vals)
#
raw_df['language'] = language_vals
raw_df
Out[8]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | hungarian | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | language | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | kamaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic |
| 1 | ʔanaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic |
| 2 | lahu | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic |
| 3 | ʔan | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic |
| 4 | huːa | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish |
| 995 | çoğul | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish |
| 996 | öfke | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish |
| 997 | iddia | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish |
| 998 | kıta | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish |
28858 rows × 24 columns
In [9]:
## 文字数の列を追加
raw_df['size'] = [ len(x) for x in raw_df[doc_type] ]
raw_df
Out[9]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | language | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | kamaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 5 |
| 1 | ʔanaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 5 |
| 2 | lahu | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 4 |
| 3 | ʔan | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 3 |
| 4 | huːa | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish | 5 |
| 995 | çoğul | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish | 5 |
| 996 | öfke | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish | 4 |
| 997 | iddia | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish | 5 |
| 998 | kıta | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | turkish | 4 |
28858 rows × 25 columns
In [10]:
## 解析データの型指定
print(f"doc_attr: {doc_attr}")
raw_df = raw_df[raw_df[doc_attr] == 1]
raw_df
doc_attr: sound
Out[10]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | language | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | kamaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 5 |
| 1 | ʔanaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 5 |
| 2 | lahu | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 4 |
| 3 | ʔan | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 3 |
| 4 | huːa | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 878 | pua | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 3 |
| 879 | wiᵑgi | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 5 |
| 880 | hasiɾa | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 6 |
| 881 | maɗai | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 5 |
| 882 | ɓaɾa | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 4 |
9536 rows × 25 columns
In [11]:
## 言語の選別
if select_languages:
df_new = [ ]
for lang in selected_langs:
df_new.append(raw_df[raw_df[lang] == 1])
raw_df = pd.concat(df_new)
#
raw_df
Out[11]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | language | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | kamaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 5 |
| 1 | ʔanaː | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 5 |
| 2 | lahu | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 4 |
| 3 | ʔan | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 3 |
| 4 | huːa | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 878 | pua | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 3 |
| 879 | wiᵑgi | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 5 |
| 880 | hasiɾa | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 6 |
| 881 | maɗai | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 5 |
| 882 | ɓaɾa | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 4 |
8740 rows × 25 columns
In [12]:
## 文字数の分布
import numpy as np
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.hist(raw_df['size'], bins = 40)
ax.set_xlabel('length of doc')
ax.set_ylabel('freq')
plt.title(f"Length distribution for docs")
fig.show()
/var/folders/7s/_syxn0dd45lcngw4yl2kywd40000gn/T/ipykernel_37910/1088473461.py:12: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown fig.show()
In [13]:
## 長さで濾過
print(f"max doc size: {max_doc_size}")
original_size = len(raw_df)
raw_df = raw_df[raw_df['size'] < max_doc_size]
filtered_size = len(raw_df)
print(f"{original_size - filtered_size} cases removed")
max doc size: 12 197 cases removed
In [14]:
## 結果の検査 1
for type in types:
print(raw_df[type].value_counts())
spell 0 8543 Name: count, dtype: int64 sound 1 8543 Name: count, dtype: int64
In [15]:
## 結果の検査 2
for lang in langs:
print(raw_df[lang].value_counts())
arabic 0 7819 1 724 Name: count, dtype: int64 bengali 0 8543 Name: count, dtype: int64 chinese 0 8543 Name: count, dtype: int64 english 0 4442 1 4101 Name: count, dtype: int64 esperanto 0 8543 Name: count, dtype: int64 finnish 0 8543 Name: count, dtype: int64 french 0 7584 1 959 Name: count, dtype: int64 greek 0 8543 Name: count, dtype: int64 galician 0 8543 Name: count, dtype: int64 german 0 8543 Name: count, dtype: int64 hungarian 0 8543 Name: count, dtype: int64 icelandic 0 7614 1 929 Name: count, dtype: int64 irish 0 8543 Name: count, dtype: int64 italian 0 8543 Name: count, dtype: int64 japanese 0 8543 Name: count, dtype: int64 russian 0 8543 Name: count, dtype: int64 spanish 0 7594 1 949 Name: count, dtype: int64 swahili 0 7662 1 881 Name: count, dtype: int64 turkish 0 8543 Name: count, dtype: int64
In [16]:
## 統合: 割合補正を適用
eng_reduct_factor = 0.2
if balanced:
eng_df = raw_df[raw_df['english'] == 1]
non_eng_df = raw_df[raw_df['english'] == 0]
eng_reduced_df = eng_df.sample(round(len(eng_df) * eng_reduct_factor))
raw_df = pd.concat([eng_reduced_df, non_eng_df])
raw_df
Out[16]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | language | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 343 | bɹeɪvəɹɪ | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 8 |
| 2302 | saʊndz | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 6 |
| 2529 | swip | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 4 |
| 2546 | səksɛs | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 6 |
| 1905 | nɛp | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 878 | pua | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 3 |
| 879 | wiᵑgi | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 5 |
| 880 | hasiɾa | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 6 |
| 881 | maɗai | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 5 |
| 882 | ɓaɾa | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 4 |
5262 rows × 25 columns
In [17]:
## 結果の検査 3
for lang in langs:
print(raw_df[lang].value_counts())
arabic 0 4538 1 724 Name: count, dtype: int64 bengali 0 5262 Name: count, dtype: int64 chinese 0 5262 Name: count, dtype: int64 english 0 4442 1 820 Name: count, dtype: int64 esperanto 0 5262 Name: count, dtype: int64 finnish 0 5262 Name: count, dtype: int64 french 0 4303 1 959 Name: count, dtype: int64 greek 0 5262 Name: count, dtype: int64 galician 0 5262 Name: count, dtype: int64 german 0 5262 Name: count, dtype: int64 hungarian 0 5262 Name: count, dtype: int64 icelandic 0 4333 1 929 Name: count, dtype: int64 irish 0 5262 Name: count, dtype: int64 italian 0 5262 Name: count, dtype: int64 japanese 0 5262 Name: count, dtype: int64 russian 0 5262 Name: count, dtype: int64 spanish 0 4313 1 949 Name: count, dtype: int64 swahili 0 4381 1 881 Name: count, dtype: int64 turkish 0 5262 Name: count, dtype: int64
解析¶
In [18]:
## 順序のランダマイズし,基本データを決定
import sklearn.utils
df = sklearn.utils.shuffle(raw_df)
In [19]:
## ngram の追加
import sys
sys.path.append('..')
import re
import ngrams
import importlib
importlib.reload(ngrams)
import ngrams_skippy
bases = df[doc_type]
## 1gram 列の追加
#sep = r""
#unigrams = [ list(filter(lambda x: len(x) > 0, y)) for y in [ re.split(sep, z) for z in bases ] ]
unigrams = ngrams.gen_unigrams(bases, sep = r"", check = False)
if verbose:
random.sample(unigrams, 5)
#
df['1gram'] = unigrams
df
Out[19]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | irish | italian | japanese | russian | spanish | swahili | turkish | language | size | 1gram | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 785 | korjente | 1.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | spanish | 8 | [k, o, r, j, e, n, t, e] |
| 803 | pistola | 1.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | spanish | 7 | [p, i, s, t, o, l, a] |
| 2296 | saɪləns | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 7 | [s, a, ɪ, l, ə, n, s] |
| 833 | flɔkɔnɛʁ | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | french | 8 | [f, l, ɔ, k, ɔ, n, ɛ, ʁ] |
| 798 | aswafəʁɔ̃ | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | french | 9 | [a, s, w, a, f, ə, ʁ, ɔ, ̃] |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2109 | pɛnɪ | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 4 | [p, ɛ, n, ɪ] |
| 527 | aːlmaʕaːdin | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | arabic | 11 | [a, ː, l, m, a, ʕ, a, ː, d, i, n] |
| 32 | katika | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | swahili | 6 | [k, a, t, i, k, a] |
| 643 | ɑ̃paʁat | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | french | 7 | [ɑ, ̃, p, a, ʁ, a, t] |
| 2905 | tʃɔkələts | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | english | 9 | [t, ʃ, ɔ, k, ə, l, ə, t, s] |
5262 rows × 26 columns
In [20]:
## 2gram列の追加
bigrams = ngrams.gen_bigrams(bases, sep = r"", check = False)
## 包括的 2gram の作成
if ngram_is_inclusive:
bigrams = [ [*b, *u] for b, u in zip(bigrams, unigrams) ]
if verbose:
print(random.sample(bigrams, 3))
In [21]:
df['2gram'] = bigrams
if verbose:
df
In [22]:
## 3gram列の追加
trigrams = ngrams.gen_trigrams(bases, sep = r"", check = False)
## 包括的 3gram の作成
if ngram_is_inclusive:
trigrams = [ [ *t, *b ] for t, b in zip(trigrams, bigrams) ]
if verbose:
print(random.sample(trigrams, 3))
In [23]:
df['3gram'] = trigrams
if verbose:
df
In [24]:
## skippy 2grams の生成
import sys
sys.path.append("..") # library path に一つ上の階層を追加
import ngrams_skippy
skippy_2grams = [ ngrams_skippy.generate_skippy_bigrams(x,
missing_mark = '…',
max_distance = max_distance_val, check = False)
for x in df['1gram'] ]
## 包括的 skippy 2-grams の生成
if ngram_is_inclusive:
for i, b2 in enumerate(skippy_2grams):
b2.extend(unigrams[i])
#
if verbose:
random.sample(skippy_2grams, 3)
In [25]:
## skippy 2gram 列の追加
df['skippy2gram'] = skippy_2grams
df
Out[25]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | russian | spanish | swahili | turkish | language | size | 1gram | 2gram | 3gram | skippy2gram | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 785 | korjente | 1.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | spanish | 8 | [k, o, r, j, e, n, t, e] | [ko, or, rj, je, en, nt, te, k, o, r, j, e, n,... | [kor, orj, rje, jen, ent, nte, ko, or, rj, je,... | [ko, k…r, k…j, k…e, k…n, k…t, or, o…j, o…e, o…... |
| 803 | pistola | 1.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | spanish | 7 | [p, i, s, t, o, l, a] | [pi, is, st, to, ol, la, p, i, s, t, o, l, a] | [pis, ist, sto, tol, ola, pi, is, st, to, ol, ... | [pi, p…s, p…t, p…o, p…l, p…a, is, i…t, i…o, i…... |
| 2296 | saɪləns | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | english | 7 | [s, a, ɪ, l, ə, n, s] | [sa, aɪ, ɪl, lə, ən, ns, s, a, ɪ, l, ə, n, s] | [saɪ, aɪl, ɪlə, lən, əns, sa, aɪ, ɪl, lə, ən, ... | [sa, s…ɪ, s…l, s…ə, s…n, s…s, aɪ, a…l, a…ə, a…... |
| 833 | flɔkɔnɛʁ | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | french | 8 | [f, l, ɔ, k, ɔ, n, ɛ, ʁ] | [fl, lɔ, ɔk, kɔ, ɔn, nɛ, ɛʁ, f, l, ɔ, k, ɔ, n,... | [flɔ, lɔk, ɔkɔ, kɔn, ɔnɛ, nɛʁ, fl, lɔ, ɔk, kɔ,... | [fl, f…ɔ, f…k, f…n, f…ɛ, f…ʁ, lɔ, l…k, l…ɔ, l…... |
| 798 | aswafəʁɔ̃ | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | french | 9 | [a, s, w, a, f, ə, ʁ, ɔ, ̃] | [as, sw, wa, af, fə, əʁ, ʁɔ, ɔ̃, a, s, w, a, f... | [asw, swa, waf, afə, fəʁ, əʁɔ, ʁɔ̃, as, sw, wa... | [as, a…w, a…a, a…f, a…ə, a…ʁ, a…ɔ, a…̃, sw, s…... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2109 | pɛnɪ | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | english | 4 | [p, ɛ, n, ɪ] | [pɛ, ɛn, nɪ, p, ɛ, n, ɪ] | [pɛn, ɛnɪ, pɛ, ɛn, nɪ, p, ɛ, n, ɪ] | [pɛ, p…n, p…ɪ, ɛn, ɛ…ɪ, nɪ, p, ɛ, n, ɪ] |
| 527 | aːlmaʕaːdin | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | arabic | 11 | [a, ː, l, m, a, ʕ, a, ː, d, i, n] | [aː, ːl, lm, ma, aʕ, ʕa, aː, ːd, di, in, a, ː,... | [aːl, ːlm, lma, maʕ, aʕa, ʕaː, aːd, ːdi, din, ... | [aː, a…l, a…m, a…a, a…ʕ, a…ː, a…d, a…i, a…n, ː... |
| 32 | katika | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | swahili | 6 | [k, a, t, i, k, a] | [ka, at, ti, ik, ka, k, a, t, i, k, a] | [kat, ati, tik, ika, ka, at, ti, ik, ka, k, a,... | [ka, k…t, k…i, k…k, k…a, at, a…i, a…k, a…a, ti... |
| 643 | ɑ̃paʁat | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | french | 7 | [ɑ, ̃, p, a, ʁ, a, t] | [ɑ̃, ̃p, pa, aʁ, ʁa, at, ɑ, ̃, p, a, ʁ, a, t] | [ɑ̃p, ̃pa, paʁ, aʁa, ʁat, ɑ̃, ̃p, pa, aʁ, ʁa, ... | [ɑ̃, ɑ…p, ɑ…a, ɑ…ʁ, ɑ…t, ̃p, ̃…a, ̃…ʁ, ̃…t, pa... |
| 2905 | tʃɔkələts | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | english | 9 | [t, ʃ, ɔ, k, ə, l, ə, t, s] | [tʃ, ʃɔ, ɔk, kə, əl, lə, ət, ts, t, ʃ, ɔ, k, ə... | [tʃɔ, ʃɔk, ɔkə, kəl, ələ, lət, əts, tʃ, ʃɔ, ɔk... | [tʃ, t…ɔ, t…k, t…ə, t…l, t…t, t…s, ʃɔ, ʃ…k, ʃ…... |
5262 rows × 29 columns
In [26]:
## skippy 3grams の生成
#import sys
#sys.path.append("..") # library path に一つ上の階層を追加
import ngrams_skippy
skippy_3grams = [ ngrams_skippy.generate_skippy_trigrams(x,
missing_mark = '…',
max_distance = max_distance_val, check = False)
for x in df['1gram'] ]
## 包括的 skippy 3-grams の生成
if ngram_is_inclusive:
for i, t2 in enumerate(skippy_3grams):
t2.extend(skippy_2grams[i])
#
if verbose:
random.sample(skippy_3grams, 3)
In [27]:
## skippy 3gram 列の追加
df['skippy3gram'] = skippy_3grams
df
Out[27]:
| form | freq | spell | sound | arabic | bengali | chinese | english | esperanto | finnish | ... | spanish | swahili | turkish | language | size | 1gram | 2gram | 3gram | skippy2gram | skippy3gram | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 785 | korjente | 1.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | spanish | 8 | [k, o, r, j, e, n, t, e] | [ko, or, rj, je, en, nt, te, k, o, r, j, e, n,... | [kor, orj, rje, jen, ent, nte, ko, or, rj, je,... | [ko, k…r, k…j, k…e, k…n, k…t, or, o…j, o…e, o…... | [kor, ko…j, ko…e, ko…n, ko…t, k…rj, k…r…e, k…r... |
| 803 | pistola | 1.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | spanish | 7 | [p, i, s, t, o, l, a] | [pi, is, st, to, ol, la, p, i, s, t, o, l, a] | [pis, ist, sto, tol, ola, pi, is, st, to, ol, ... | [pi, p…s, p…t, p…o, p…l, p…a, is, i…t, i…o, i…... | [pis, pi…t, pi…o, pi…l, pi…a, p…st, p…s…o, p…s... |
| 2296 | saɪləns | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | english | 7 | [s, a, ɪ, l, ə, n, s] | [sa, aɪ, ɪl, lə, ən, ns, s, a, ɪ, l, ə, n, s] | [saɪ, aɪl, ɪlə, lən, əns, sa, aɪ, ɪl, lə, ən, ... | [sa, s…ɪ, s…l, s…ə, s…n, s…s, aɪ, a…l, a…ə, a…... | [saɪ, sa…l, sa…ə, sa…n, sa…s, s…ɪl, s…ɪ…ə, s…ɪ... |
| 833 | flɔkɔnɛʁ | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | french | 8 | [f, l, ɔ, k, ɔ, n, ɛ, ʁ] | [fl, lɔ, ɔk, kɔ, ɔn, nɛ, ɛʁ, f, l, ɔ, k, ɔ, n,... | [flɔ, lɔk, ɔkɔ, kɔn, ɔnɛ, nɛʁ, fl, lɔ, ɔk, kɔ,... | [fl, f…ɔ, f…k, f…n, f…ɛ, f…ʁ, lɔ, l…k, l…ɔ, l…... | [flɔ, fl…k, fl…ɔ, fl…n, fl…ɛ, fl…ʁ, f…ɔk, f…ɔ…... |
| 798 | aswafəʁɔ̃ | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | french | 9 | [a, s, w, a, f, ə, ʁ, ɔ, ̃] | [as, sw, wa, af, fə, əʁ, ʁɔ, ɔ̃, a, s, w, a, f... | [asw, swa, waf, afə, fəʁ, əʁɔ, ʁɔ̃, as, sw, wa... | [as, a…w, a…a, a…f, a…ə, a…ʁ, a…ɔ, a…̃, sw, s…... | [asw, as…a, as…f, as…ə, as…ʁ, as…ɔ, as…̃, a…wa... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2109 | pɛnɪ | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | english | 4 | [p, ɛ, n, ɪ] | [pɛ, ɛn, nɪ, p, ɛ, n, ɪ] | [pɛn, ɛnɪ, pɛ, ɛn, nɪ, p, ɛ, n, ɪ] | [pɛ, p…n, p…ɪ, ɛn, ɛ…ɪ, nɪ, p, ɛ, n, ɪ] | [pɛn, pɛ…ɪ, p…nɪ, ɛnɪ, pɛ, p…n, p…ɪ, ɛn, ɛ…ɪ, ... |
| 527 | aːlmaʕaːdin | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | arabic | 11 | [a, ː, l, m, a, ʕ, a, ː, d, i, n] | [aː, ːl, lm, ma, aʕ, ʕa, aː, ːd, di, in, a, ː,... | [aːl, ːlm, lma, maʕ, aʕa, ʕaː, aːd, ːdi, din, ... | [aː, a…l, a…m, a…a, a…ʕ, a…ː, a…d, a…i, a…n, ː... | [aːl, aː…m, aː…a, aː…ʕ, aː…ː, aː…d, aː…i, a…lm... |
| 32 | katika | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | swahili | 6 | [k, a, t, i, k, a] | [ka, at, ti, ik, ka, k, a, t, i, k, a] | [kat, ati, tik, ika, ka, at, ti, ik, ka, k, a,... | [ka, k…t, k…i, k…k, k…a, at, a…i, a…k, a…a, ti... | [kat, ka…i, ka…k, ka…a, k…ti, k…t…k, k…t…a, k…... |
| 643 | ɑ̃paʁat | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | french | 7 | [ɑ, ̃, p, a, ʁ, a, t] | [ɑ̃, ̃p, pa, aʁ, ʁa, at, ɑ, ̃, p, a, ʁ, a, t] | [ɑ̃p, ̃pa, paʁ, aʁa, ʁat, ɑ̃, ̃p, pa, aʁ, ʁa, ... | [ɑ̃, ɑ…p, ɑ…a, ɑ…ʁ, ɑ…t, ̃p, ̃…a, ̃…ʁ, ̃…t, pa... | [ɑ̃p, ɑ̃…a, ɑ̃…ʁ, ɑ̃…t, ɑ…pa, ɑ…p…ʁ, ɑ…p…a, ɑ…... |
| 2905 | tʃɔkələts | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | english | 9 | [t, ʃ, ɔ, k, ə, l, ə, t, s] | [tʃ, ʃɔ, ɔk, kə, əl, lə, ət, ts, t, ʃ, ɔ, k, ə... | [tʃɔ, ʃɔk, ɔkə, kəl, ələ, lət, əts, tʃ, ʃɔ, ɔk... | [tʃ, t…ɔ, t…k, t…ə, t…l, t…t, t…s, ʃɔ, ʃ…k, ʃ…... | [tʃɔ, tʃ…k, tʃ…ə, tʃ…l, tʃ…t, tʃ…s, t…ɔk, t…ɔ…... |
5262 rows × 30 columns
In [28]:
## LDA 構築の基になる document-term matrix (dtm) を構築
from gensim.corpora.dictionary import Dictionary
bots = df[term_type]
diction = Dictionary(bots)
## 結果の確認
print(diction)
Dictionary<60325 unique tokens: ['e', 'en', 'ent', 'en…e', 'e…e']...>
In [29]:
## diction の濾過
import copy
diction_copy = copy.deepcopy(diction)
## filter適用: 実は諸刃の刃で,token数が少ない時には適用しない方が良い
print(f"min freq filter: {term_min_freq}")
print(f"abuse filter: {term_abuse_threshold}")
apply_filter = True
if apply_filter:
diction_copy.filter_extremes(no_below = term_min_freq, no_above = term_abuse_threshold)
## check
print(diction_copy)
min freq filter: 2 abuse filter: 0.04 Dictionary<31740 unique tokens: ['en', 'ent', 'en…e', 'e…e', 'e…t']...>
In [30]:
## Corpus (gensim の用語では corpus) の構築
corpus = [ diction.doc2bow(bot) for bot in bots ]
## check
check = True
if verbose:
sample_n = 5
print(random.sample(corpus, sample_n))
#
print(f"Number of documents: {len(corpus)}")
Number of documents: 5262
In [31]:
## LDA モデルの構築
from gensim.models import LdaModel
#from tqdm import tqdm
## LDAモデル
print(f"Building LDA model with n_topics: {n_topics}")
lda = LdaModel(corpus, id2word = diction, num_topics = n_topics, alpha = 0.01)
#
print(lda) # print(..)しないと中身が見れない
Building LDA model with n_topics: 15 LdaModel<num_terms=60325, num_topics=15, decay=0.5, chunksize=2000>
In [32]:
%%capture --no-display
## LDA のtopic ごとに,関連度の高い term を表示
import pandas as pd
n_terms = 40 # topic ごとに表示する term 数の指定
topic_dfs = [ ]
for topic in range(n_topics):
terms = [ ]
for i, prob in lda.get_topic_terms(topic, topn = n_terms):
terms.append(diction.id2token[ int(i) ])
#
topic_dfs.append(pd.DataFrame([terms], index = [ f'topic {topic+1}' ]))
#
topic_term_df = pd.concat(topic_dfs)
## Table で表示
topic_term_df.T
Out[32]:
| topic 1 | topic 2 | topic 3 | topic 4 | topic 5 | topic 6 | topic 7 | topic 8 | topic 9 | topic 10 | topic 11 | topic 12 | topic 13 | topic 14 | topic 15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | a | a | ɪ | a | a | t | i | n | a | a | o | e | a | o | a |
| 1 | ɔ | ː | m | e | e | n | t | a | m | k | a | a | ʁ | i | a…a |
| 2 | ̃ | l | a | ɾ | w | s | i…i | d | r | t | ð | t | e | e | e |
| 3 | ʁ | a…a | s | p | k | ɪ | m | na | a…a | u | t | u | i | n | k |
| 4 | ː | aː | n | s | l | ɛ | n | i | ː | i | a…o | ə | s | a | p |
| 5 | s | l…a | ɑ | i | wa | a | l | o | ma | s | ɛ | n | k | k | l |
| 6 | ɔ̃ | i | t | o | ̃ | ɹ | e | a…n | d | k…a | l | ɾ | ɛ | s | o |
| 7 | t | a…l | ə | m | a…a | ː | a | s | t | k…t | k | l | ə | r | s |
| 8 | ʁ…̃ | b | r | n | p | d | ɪ | t | u | ʁ | p…o | s | u | ː | ɪ |
| 9 | j | la | v | w | ɔ | l | s | u | l | u…a | ɔ | d | p | t | n |
| 10 | i | a…ː | i | ʃ | g | ə | ə | f | i | e | m | m | l | e…o | t |
| 11 | u | ː…a | l | a…a | ɓ | u | s…i | e | k | m | ə | f | z | ð | i |
| 12 | v | r | e | e…ɾ | i | i | a…i | ɹ | h | l | pa…o | ɹ | əʁ | m | m |
| 13 | ɛ | l…ː | ɛ | t | ɾ | o | m…i | no | a…ː | a…a | a…ð | ɪ | ʁa | n…o | k…a |
| 14 | b | aː…a | ̃ | j | j | j | ni | in | n | t…a | o…o | a…e | f | i…a | p…a |
| 15 | ʁ…ɔ | aːl | d | aɾ | ɓa | m | h | m…n | a…m | ka | e | j | ʁ…a | en | s…a |
| 16 | s…ː | ːl | k | e…a | β | ɪ…t | i…a | an | a…a…a | ɛ | k…le | i | t | i…o | a…e |
| 17 | ʁ…ɔ̃ | u | ɹ | p…ɾ | u | nt | ʃ | f…n | am | ku | ʁ | p | k…a | m…o | i…a |
| 18 | jɔ | s | ɑ̃ | k | ke | a…t | i…t | ɔ | aː | i…a | p | e…e | ʁ…ʁ | ː…a | ka |
| 19 | s…ɔ | a…i | n…ɹ | s…ɾ | ɑ̃ | st | t…i | n…s | a…l | ɾ | ɾ | r | ku | si | e…a |
| 20 | j…̃ | a…l…a | ɾ | eɾ | s | e | z | i…a | a…i | k…i | l…o | ð | a…i | iː | ɾ |
| 21 | l | ʔ | ː | ɗ | n | ɛ…t | ɾi | ina | m…ː | at | i | o | ɪ | j | a…i |
| 22 | p | a…r | ð | es | o | n…t | k | k | a…r | a…t | n | æ | a…a | ko | ta |
| 23 | jɔ̃ | m | ɔ | u | ʁ | ɛn | ɹ | ɡ | ʏ | ku…a | ː | ʏ | ɔ | o…o | a…o |
| 24 | e | i…a | k…n | ɾ…ɾ | a…e | t…t | is | a…a | ɪ | a…i | ːð | je | i…a | on | al |
| 25 | ə…̃ | ːl…a | əɹ | e…e | we | s…n | tɪ | s…n | i…a | s…t | o…a | z | n | i…n | ma |
| 26 | s…̃ | a…a…a | n…ə | p…a | g…a | f | il | un | s | f | ɪ | ː | e…a | l | θ |
| 27 | i…̃ | d | ɑ…t | d | t | s…t | ɾ | m | m…l | ʃ | k…t | ʃ | u…a | k…o | p…t |
| 28 | s…a | t | st | w…o | ᵑg | t…n | i…e | a…na | ha | k…k | j | t…e | t…ʁ | k…n | na |
| 29 | t…̃ | iː | m…̃ | ⁿ | ᵑ | r | mi | a…d | au | p | ðo | æ…ə | ʁɛ | θ | m…a |
| 30 | ə | a…aː | ɛ…ɪ | p…e | ɓ…i | v | in | j | d…a | k…u | m…o | k | ʁe | ða | an |
| 31 | a…ː | ba | ̃t | β | ̃t | ou | li | m…na | ʏr | e…a | g | t…a | r | o…e | u |
| 32 | ɑ | q | ə…ə | pa | k…a | ɔ | ɛ | a…ad | ama | ta | ða | əɹ | m | p | j |
| 33 | ʒ | aː…ː | m…ə | ɾ…a | ̃…ʁ | k | o | m…a | ʰ | k…t…a | a…ðo | ɛ | ː | nt | o…a |
| 34 | d | l…i | l…ħ | e…m | ɑ | d…t | it | k…n | ː…a | ik | ɓo | t…ə | a…s | m…e | l…a |
| 35 | ɑ̃ | ra | ɑ̃t | ɑ | e…e | h | ei | a…ːn | ʕ | ut | k…o | e…a | a…ʁ | aː | t…a |
| 36 | a…ħ | al | o | iɾ | ̃…a | ɪn | ɪ…ə | de | a…ma | z | k…ð | ʁ | i…ʁ | i…e | k…l |
| 37 | ː…a | ʔ…a | m…n | i…o | p…e | ta | i…i…a | d…ʁ | dʒ | t…k | k…l | b | ʊ | j…n | a…a…a |
| 38 | ə…ɔ | ː…ː | ɪ…ɪ | e…i | ɓ…ɗ | uː | f | d…a | ʒ | u…i | ɛ…ð | de | ku…a | a…i | n…a |
| 39 | uː | ʔa | k…ɪ | p…n | ɔ…a | ɜ | h…i | e…s | ra | ti | ko | u…a | s…ʁ | s…o | a…n |
In [33]:
%%capture --no-display
## pyLDAvis を使った結果 LDA の可視化: 階層クラスタリングより詳しい
import pyLDAvis
#installed_version = sys.version
installed_version = pyLDAvis.__version__
print(f"installed_version: {installed_version}")
if float(installed_version[:3]) > 3.1:
import pyLDAvis.gensim_models as gensimvis
else:
import pyLDAvis.gensim as gensimvis
#
pyLDAvis.enable_notebook()
#
lda_used = lda
corpus_used = corpus
diction_used = diction
## 実行パラメター
use_tSNE = False
if use_tSNE:
vis = gensimvis.prepare(lda_used, corpus_used, diction_used, mds = 'tsne',
n_jobs = 1, sort_topics = False)
else:
vis = gensimvis.prepare(lda_used, corpus_used, diction_used,
n_jobs = 1, sort_topics = False)
#
pyLDAvis.display(vis)
## topic を表わす円の重なりが多いならn_topics が多過ぎる可能性がある.
## ただし2Dで重なっていても,3Dなら重なっていない可能性もある
Out[33]:
In [34]:
## LDA がD に対して生成した topics の弁別性を確認
## 得られたtopics を確認
topic_dist = lda.get_topics()
if verbose:
topic_dist
In [35]:
## 検査 1: topic ごとに分布の和を取る
print(topic_dist.sum(axis = 1))
[1.0000001 1. 1. 1. 1. 0.9999999 1.0000001 1. 1. 1. 0.99999994 0.9999999 1.0000001 1. 1. ]
In [36]:
## 検査 2: 総和を求める: n_topics にほぼ等しいなら正常
print(topic_dist.sum())
15.000007
In [37]:
## term エンコード値の分布を確認
import matplotlib.pyplot as plt
plt.figure(figsize = (4,5))
sampling_rate = 0.3
df_size = len(topic_dist)
sample_n = round(df_size * sampling_rate)
topic_sampled = random.sample(list(topic_dist), sample_n)
T = sorted([ sorted(x, reverse = True) for x in topic_sampled ])
plt.plot(T, range(len(T)))
plt.title("Distribution of sorted values ({sample_n} samples) for topic/term encoding")
plt.show()
In [38]:
## tSNE を使った topics のグループ化 (3D)
from sklearn.manifold import TSNE
import numpy as np
## tSNE のパラメターを設定
## n_components は射影先の空間の次元: n_components = 3 なら3次元空間に射影
## perplexity は結合の強さを表わす指数で,値に拠って結果が代わるので,色々な値を試すと良い
#perplexity_val = 10 # 大き過ぎると良くない
top_perplexity_reduct_rate = 0.3
perplexity_val = round(len(topic_dist) * top_perplexity_reduct_rate)
topic_tSNE_3d = TSNE(n_components = 3, random_state = 0, perplexity = perplexity_val, n_iter = 1000)
## データに適用
top_tSNE_3d_fitted = topic_tSNE_3d.fit_transform(np.array(topic_dist))
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/threadpoolctl.py:1010: RuntimeWarning:
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md
warnings.warn(msg, RuntimeWarning)
In [39]:
## Plotlyを使って tSNE の結果の可視化 (3D)
#import plotly.express as pex
import plotly.graph_objects as go
import numpy as np
top_tSNE = top_tSNE_3d_fitted
fig = go.Figure(data = [go.Scatter3d(x = top_tSNE[:,0], y = top_tSNE[:,1], z = top_tSNE[:,2],
mode = 'markers')])
## 3D 散布図にラベルを追加する処理は未実装
title_val = f"3D tSNE view for LDA (#topics: {n_topics}, doc: {doc_type}, term: {term_type})"
fig.update_layout(autosize = False,
width = 600, height = 600, title = title_val)
fig.show()
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
In [40]:
## 構築した LDA モデルを使って文(書)を分類する
## .get_document_topics(..) は minimu_probability = 0としないと
## topic の値が小さい場合に値を返さないので,
## パラメター
ntopics = n_topics # LDA の構築の最に指定した値を使う
check = False
encoding = [ ]
for i, row in df.iterrows():
if check:
print(f"row: {row}")
doc = row[doc_type]
bot = row[term_type]
## get_document_topics(..) では minimu_probability = 0 としないと
## 値が十分に大きな topics に関してだけ値が取れる
enc = lda.get_document_topics(diction.doc2bow(bot), minimum_probability = 0)
if check:
print(f"enc: {enc}")
encoding.append(enc)
#
len(encoding)
Out[40]:
5262
In [41]:
## enc 列の追加
#df['enc'] = np.array(encoding) # This flattens arrays
#df['enc'] = list(encoding) # ineffective
df['enc'] = [ list(map(lambda x: x[1], y)) for y in encoding ]
if verbose:
df['enc']
In [42]:
## エンコーディングのstd の分布を見る
from scipy.stats import tstd
from matplotlib import pyplot as plt
plt.figure(figsize = (6,4))
std_data = [ tstd(x) for x in df['enc'] ]
plt.hist(std_data)
plt.title("Distribution of standard deviations")
plt.show()
In [43]:
## doc のエンコーディング
## 一様分布の事例を除外
from scipy.stats import tstd # standard deviation の計算用
print(f"{len(df)} instances before filtering")
check = False
doc_enc = df['enc']
max_std = max([ tstd(x) for x in doc_enc])
if check: print(f"std max: {max_std}")
min_std = min([ tstd(x) for x in doc_enc])
if check: print(f"std min: {min_std}")
first_min_std = list(sorted(set([ tstd(x) for x in doc_enc])))[-0]
print(f"std 1st min: {first_min_std}")
second_min_std = list(sorted(set([ tstd(x) for x in doc_enc])))[-1]
print(f"std 2nd min: {second_min_std}")
5262 instances before filtering std 1st min: 0.07048820690095535 std 2nd min: 0.2580037575971185
In [44]:
## df_filtered の定義
## 閾値は2番目に小さい値より小さく最小値よりは大きな値であるべき
std_threshold = second_min_std / 4 # 穏健な値を得るために4で割った
print(f"std_threshold: {std_threshold}")
## Rっぽい次のコードは通らない
#df_filtered = df[ df['encoding'] > std_threshold ]
## 通るのは次のコード: Creating a list of True/False and apply it to DataFrame
std_tested = [ False if tstd(x) < std_threshold else True for x in df['enc'] ]
df_filtered = df[ std_tested ]
#
print(f"{len(df_filtered)} instances after filtering ({len(df) - len(df_filtered)} instances removed)")
std_threshold: 0.06450093939927963 5262 instances after filtering (0 instances removed)
In [45]:
## doc エンコード値の分布を確認
sample_n = 50
E = sorted([ sorted(x, reverse = True) for x in df_filtered['enc'].sample(sample_n) ])
plt.figure(figsize = (5,5))
plt.plot(E, range(len(E)))
plt.title(f"Distribution of sorted encoding values for sampled {sample_n} docs")
plt.show()
In [46]:
len(df_filtered['language'])
Out[46]:
5262
In [47]:
df_filtered['language'].value_counts
Out[47]:
<bound method IndexOpsMixin.value_counts of 785 spanish
803 spanish
2296 english
833 french
798 french
...
2109 english
527 arabic
32 swahili
643 french
2905 english
Name: language, Length: 5262, dtype: object>
In [48]:
## tSNE 用の事例サンプリング = tSNE_df の定義
tSNE_sampling = True
tSNE_sampling_rate = 0.33
if tSNE_sampling:
tSNE_df_original = df_filtered.copy()
sample_n = round(len(tSNE_df_original) * tSNE_sampling_rate)
tSNE_df = tSNE_df_original.sample(sample_n)
print(f"tSNE_df has {len(tSNE_df)} rows after sampling")
else:
tSNE_df = df_filtered
tSNE_df has 1736 rows after sampling
In [49]:
tSNE_df.columns
Out[49]:
Index(['form', 'freq', 'spell', 'sound', 'arabic', 'bengali', 'chinese',
'english', 'esperanto', 'finnish', 'french', 'greek', 'galician',
'german', 'hungarian', 'icelandic', 'irish', 'italian', 'japanese',
'russian', 'spanish', 'swahili', 'turkish', 'language', 'size', '1gram',
'2gram', '3gram', 'skippy2gram', 'skippy3gram', 'enc'],
dtype='object')
In [50]:
tSNE_df['language'].value_counts
Out[50]:
<bound method IndexOpsMixin.value_counts of 12 french
211 arabic
341 french
892 french
648 spanish
...
315 swahili
211 swahili
808 spanish
489 french
674 spanish
Name: language, Length: 1736, dtype: object>
In [51]:
## tSNE の結果の可視化: Plotly を使った 3D 描画
import numpy as np
from sklearn.manifold import TSNE as tSNE
import plotly.express as pex
import plotly.graph_objects as go
import matplotlib.pyplot as plt
## tSNE のパラメターを設定
perplexity_max_val = round(len(tSNE_df)/4)
for perplexity_val in range(5, perplexity_max_val, 60):
## tSNE 事例の生成
tSNE_3d_varied = tSNE(n_components = 3, random_state = 0, perplexity = perplexity_val, n_iter = 1000)
## データに適用
doc_enc = np.array(list(tSNE_df['enc']))
doc_tSNE_3d_varied = tSNE_3d_varied.fit_transform(doc_enc)
T = zip(doc_tSNE_3d_varied[:,0], doc_tSNE_3d_varied[:,1], doc_tSNE_3d_varied[:,2],
tSNE_df['language']) # zip(..)が必要
df = pd.DataFrame(T, columns = ['D1', 'D2', 'D3', 'language'])
## 作図
fig = go.Figure()
for lang in np.unique(df['language']):
part = df[df['language'] == lang]
fig.add_trace(
go.Scatter3d(
x = part['D1'], y = part['D2'], z = part['D3'],
name = lang, mode = 'markers', marker = dict(size = 4),
showlegend = True
)
)
title_val = f"tSNE 3D map (ppl: {perplexity_val}) of '{doc_attr}'s encoded\n by LDA ({n_topics} topics, {term_type})"
fig.update_layout(title = dict(text = title_val),
autosize = False, width = 600, height = 600,)
fig.show()
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/express/imshow_utils.py:24: DeprecationWarning: `np.bool8` is a deprecated alias for `np.bool_`. (Deprecated NumPy 1.24) /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
/Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. /Users/kowk/opt/anaconda3/lib/python3.9/site-packages/plotly/io/_renderers.py:395: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
In [52]:
## 階層クラスタリングのための事例のサンプリング
hc_sampling_rate = 0.1 # 大きくし過ぎると図が見にくい
df_size = len(tSNE_df)
hc_sample_n = round(df_size * hc_sampling_rate)
hc_df = tSNE_df.sample(hc_sample_n)
##
print(f"{hc_sample_n} rows are sampled")
hc_df['language'].value_counts()
174 rows are sampled
Out[52]:
language spanish 34 french 31 english 30 swahili 28 arabic 28 icelandic 23 Name: count, dtype: int64
In [53]:
## 日本語表示のための設定
#import matplotlib.pyplot as plt
#plt.rcParams["font.family"] = "Hiragino Sans" # Windows は別のフォント名を指定する必要がある
#plt.rcParams["font.family"] = "Lucida Sans Unicode"
In [54]:
## doc 階層クラスタリングの実行
import numpy as np
import plotly
import matplotlib.pyplot as plt
## 次の設定は arabic が文字化けする
#plt.rcParams["font.family"] = "Hiragino Sans" # Windows は別のフォント名を指定する必要がある
#plt.rcParams["font.family"] = "Lucida Sans Unicode"
from scipy.cluster.hierarchy import dendrogram, linkage
## 距離行列の生成
Enc = list(hc_df['enc'])
linkage = linkage(Enc, method = 'ward', metric = 'euclidean')
## 描画サイズの指定
plt.figure(figsize = (5, round(len(hc_df) * 0.15))) # This needs to be run here, before dendrogram construction.
## 事例ラベルの生成
label_vals = [ x[:max_doc_size] for x in list(hc_df[doc_type]) ] # truncate doc keys
## 樹状分岐図の作成
dendrogram(linkage, orientation = 'left', labels = label_vals, leaf_font_size = 7)
## 描画
plt.title(f"Hierarchical clustering of (sampled) {len(hc_df)} (= {100 * hc_sampling_rate}%) {doc_attr}s as docs\n \
encoded via LDA ({n_topics} topics) with {term_type} as terms")
## ラベルに language に対応する色を付ける
lang_colors = { lang_name : i for i, lang_name in enumerate(np.unique(hc_df['language'])) }
ax = plt.gca()
for ticker in ax.get_ymajorticklabels():
form = ticker.get_text()
row = hc_df.loc[hc_df[doc_type] == form]
#lang = row['language']
lang = row['language'].to_string().split()[-1] # trick
try:
lang_id = lang_colors[lang]
except (TypeError, KeyError):
print(f"color encoding error at: {lang}")
#
ticker.set_color(plotly.colors.qualitative.Plotly[lang_id]) # id の基数調整
#
plt.show()
In [57]:
## tSNE の結果の可視化 (2D)
#import seaborn as sns
import numpy as np
import plotly
import plotly.express as pex
import matplotlib.pyplot as plt
from adjustText import adjust_text
## tSNE 事例の生成
perplexity_selected = 150
tSNE_3d = tSNE(n_components = 3, random_state = 0, perplexity = perplexity_selected, n_iter = 1000)
## データに適用
doc_enc = np.array(list(tSNE_df['enc']))
doc_tSNE_3d = tSNE_3d.fit_transform(doc_enc)
T = zip(doc_tSNE_3d[:,0], doc_tSNE_3d[:,1], doc_tSNE_3d[:,2],
tSNE_df['language']) # zip(..)が必要
df = pd.DataFrame(T, columns = ['D1', 'D2', 'D3', 'language'])
## 描画
## 次の設定は arabic が文字化けする
#plt.rcParams["font.family"] = "Hiragino Sans" # Windows は別のフォント名を指定する必要がある
#plt.rcParams["font.family"] = "Lucida Sans Unicode"
plt.figure(figsize = (5, 5))
plt.set_colors = pex.colors.qualitative.Plotly
for r in [ np.roll([0,1,2], -i) for i in range(0,3) ]:
if check:
print(r)
X, Y = df.iloc[:,r[0]], df.iloc[:,r[1]]
gmax = max(X.max(), Y.max())
gmin = min(X.min(), Y.min())
plt.xlim(gmin, gmax)
plt.ylim(gmin, gmax)
colormap = pex.colors.qualitative.Plotly
lang_list = list(np.unique(tSNE_df['language']))
cmapped = [ colormap[lang_list.index(lang)] for lang in df['language'] ]
scatter = plt.scatter(X, Y, s = 40, c = cmapped, edgecolors = 'w')
## 文字を表示する事例のサンプリング
lab_sampling_rate = 0.02
lab_sample_n = round(len(tSNE_df) * lab_sampling_rate)
sampled_keys = [ doc[:max_doc_size] for doc in random.sample(list(tSNE_df[doc_type]), lab_sample_n) ]
## labels の生成
texts = [ ]
for x, y, s in zip(X, Y, sampled_keys):
texts.append(plt.text(x, y, s, size = 9, color = 'blue'))
## label に repel を追加: adjustText package の導入が必要
adjust_text(texts, force_points = 0.2, force_text = 0.2,
expand_points = (1, 1), expand_text = (1, 1),
arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))
#
plt.title(f"tSNE (ppl: {perplexity_selected}) 2D map of {len(tSNE_df)} {doc_attr}s via LDA ({term_type}; {n_topics} topics)")
#plt.legend(cmapped)
plt.legend(df['language'])
plt.show()
In [ ]: